# topic 11 f
#  Load the functions we will need in this script
source( "../gnrnd5.R")
source( "../assess_normality.R")
source( "../pop_sd.R")
#
#  Here we are looking to answer questions about
#  whether or not a population is "normal", that is,
#  does the population distribution look like the 
#  normal probability curve?

#  The first thing to recognize is that no finite population
#  will ever be exactly normal.  The best we can do is to find 
#  populations that are approximately normal.  For example,
#  here is a population of 9500 values that is approximately
#  normal with mean about 45 and standard deviation about  11.
gnrnd5(269834949904, 1100004500)
#  look at the first 50 values
head( L1, 50)
#  look at the last 50 values
tail( L1, 50 )
#  find the mean and standard deviation
mean( L1 )
pop_sd( L1 )
# look at a historgram of the population
hist( L1 )
# look at a boxplot of the population
boxplot( L1, horizontal = TRUE)
# just to make our example a little easier, we can transform
# our population into a new one that has mean=45 and sd=11
L2 <- (L1 - mean(L1))/pop_sd(L1)  #  L2 will have mean=0
                                  #  sd=1
mean( L2 )
pop_sd( L2 )
L3 <- L2*11 + 45
#  L3 should have mean=45 and sd=11
mean( L3 )
pop_sd( L3 )

#  What are the things that we know about a true normal 
#  distribution?
#  First, it should look like the normal distribution.

#  Here is a plot of a N(45,11) distribution
       ######################################
       ##  making this plot is beyond to   ##
       ##  requirements of this course.    ##
       ######################################
x <- seq(1, 89, length=400)
hx <- dnorm(x, mean=45, sd=11)
plot(x, hx, type="l", lty=1,
     xlim=c(0,90), xaxp=c(0,90,18),
     ylim=c(0,0.05), yaxp=c(0,0.05,10),
     las=2, cex.axis=0.75,
     xlab="z value",
     ylab="Density", main="N(45,11) Distribution")
abline( v=0, lty=1, lwd=2)
abline( h=0, lty=1, lwd=2)
abline( h=seq(0.005,0.05,0.005), lty=3, col="darkgray")
abline( v=seq(5,90,5), lty=3, col="darkgray")
#
# then add a red vertical line at the place where we
#  have both the mean and the median
lines( c(45, 45),c(0, dnorm(45, mean=45, sd=11)), col="red", lw=2)
#  going down and up one standard deviation from the mean
#  would give us the x values of 45-11=34 and 45+11=56. 
#  Put green vertical liens there.
lines( c(34, 34),c(0, dnorm(34, mean=45, sd=11)),
       col="green", lw=2)
lines( c(56, 56),c(0, dnorm(56, mean=45, sd=11)),
       col="green", lw=2)
#  going down and up two standard deviations from the mean
#  would give us the x values of 45-2*11=23 and 45+2*11=67. 
#  Put blue vertical lines there.
lines( c(23, 23),c(0, dnorm(23, mean=45, sd=11)),
       col="blue", lw=2)
lines( c(67, 67),c(0, dnorm(67, mean=45, sd=11)),
       col="blue", lw=2)

# from what we know of the normal distribution the area under the
# curve to the left of the left blue line should be 
pnorm( 23, mean=45, sd=11)
#  this will also be the area to the right of the right blue line
pnorm( 67, mean=45, sd=11, lower.tail=FALSE)

#  We want to compare our population, L3, to this kind of a 
# distribution.  To do this, we will first get a sorted
#  listing of our population values
pop_sorted <- sort( L3 )
#  then we should have about 2.27% of those values that are 
#  less than 23.  There are 9500 values in our population
#  so we can find 2.27% of that number
0.0277*9500
# and how many do we have that are less than or equal to 23?

less_than_23 <- pop_sorted[ pop_sorted<=23 ]
length( less_than_23 )
#  we have 210 values but we expected 263.  Not the same
#  but not terribly off.

#  let us see how many are greater than or equal to 67
greater_than_67 <- pop_sorted[ pop_sorted>=67 ]
length( greater_than_67 )
#  we have 205 values but we expected 263.  Not the same
#  but not terribly off.

# We can do the same for the area to the left and right of
#  the green lines
pnorm( 34, mean=45, sd=11)
#  
pnorm( 56, mean=45, sd=11, lower.tail=FALSE)
#
#  so we can compute how many we would expect out our 9500
0.1586553*9500
#  and we can see how many of our values are in those ranges
less_than_34 <- pop_sorted[ pop_sorted<=34 ]
length( less_than_34 )
#  that was ridiculously close to what we expect.
greater_than_56 <- pop_sorted[ pop_sorted>=56 ]
length( greater_than_56 )
#  again, really close

# then, just for the sake of looking at things, 
#  what is the median of our population?  We expect it 
#  to be the same as the mean.
median( L3 )
#  that is really good.

#  to get a little bit more detail, let us look at
#  quantiles both in the ideal Normal case and in 
#  our population
our_points <- seq( 0.1, 0.9, 0.1)
our_points
# these two commands should give about the same results
qnorm( our_points, mean=45, sd=11)
quantile( pop_sorted, our_points )
#  This looks really good.

#  do this again, but at more percentage points
our_points <- seq( 0.05, 0.95, 0.05)
our_points
# these two commands should give about the same results
qnorm( our_points, mean=45, sd=11)
quantile( pop_sorted, our_points )
#  This looks really good.

#  we could do the same thing again, this time at percentage 
#  points 1%, 2%, 3%, ..., 99%  but that is just a lot of points,
#  many more than we want to compare.  But we could do that
#  as a graph.
our_points <- seq( 0.01, 0.99, 0.01)
our_points
# these two commands should give about the same results
x <- qnorm( our_points, mean=45, sd=11)
y <- quantile( pop_sorted, our_points )
plot( x,y )
#  If our population is a Normal distribution then the 
#  plot should look like a straight line of points.
#  This is, essentially, what our assess_normality 
#  function does.
assess_normality( L3 )
#  The difference is that our previous plot had 99 points whereas
#  the assess_normality plot has one point for each value in our 
#  population, in this case 9500 points.

#  Of course, we could have looked at a box and whisker
#  plot to see if the data looked normal
boxplot( L3, horizontal=TRUE)
#  For a Normal distribution the two boxes should be the
#  same size, and the length of whiskers should be about  
#  3 times the width of each of the two boxes, and, if there 
#  are many items in the population we do expect to see 
#  some outliers.

#  Alternatively, we could look at a histogram of the 
#  population.

hist( L3, main="L3 with the default breakpoint" )
#  But remember that histograms can change their appearance 
#  if we change the break points.
hist( L3, breaks=seq(0,90,4.5),
      main="L3 with interval width=4.5")
hist( L3, breaks=seq(0,90,3),
      main="L3 with interval width=3")
hist( L3, breaks=seq(0,90,1.5),
      main="L3 with interval width=1.5")
hist( L3, breaks=seq(0,90,1),
      main="L3 with interval width=1")


#######################################
#   Now look at a different population, this one is not 
#   normal.
gnrnd5( 294706944905, 800002500, 2300009000)
hist( L1 )
boxplot( L1, horizontal=TRUE)
assess_normality( L1 )